library(caret)
library(corrplot)
library(GGally)
library(tidyverse)
library(car)
library(glmnet) # lasso regression
library(ggthemes)
library(vcd)
library(viridis)
library(rpart)
library(rpart.plot)
library(leaps) # Forward selection
library(boot) # bootstrap
library(lmboot) # bootstrap models
# Goal: predict medical costs for insurance claims.
# We need to create an interpretable model.
# We also want to develop a model that predicts well on future, unseen data.
# Set the seed for reproducibility of the random train/test split below
set.seed(123)
# Load data (relative path; expects the Kaggle-style insurance dataset with
# columns age, sex, bmi, children, smoker, region, charges)
insurance_data <- read.csv("../raw_data/insurance.csv")
# Check for missing data
sum(is.na(insurance_data)) # No missing data
## [1] 0
# Create BMI categories (exploratory idea, currently disabled)
# insurance_data$bmi_category <- cut(insurance_data$bmi,
# breaks = c(-Inf, 18.5, 24.9, 29.9, Inf),
# labels = c("Underweight", "Normal weight", "Overweight", "Obesity"))
# Create North or South variable (exploratory idea, currently disabled)
# insurance_data$NORS = NA
# insurance_data$NORS = ifelse(grepl("south", insurance_data$region, ignore.case = TRUE), "south", "north")
# Convert every character column of a data frame to a factor.
#
# Args:
#   df: a data.frame whose character columns should become factors.
# Returns:
#   The same data frame with each character column replaced by a factor;
#   every other column is returned untouched.
convert_chr_to_factor <- function(df) {
  chr_cols <- vapply(df, is.character, logical(1))
  df[chr_cols] <- lapply(df[chr_cols], as.factor)
  df
}
insurance_data <- convert_chr_to_factor(insurance_data)
# Determine the size of the dataset and the training set (70/30 split)
n <- nrow(insurance_data)
train_size <- floor(0.7 * n)
# Get random indices for the training set
train_indices <- sample(seq_len(n), size = train_size)
# Split the data into training and test sets
train_data <- insurance_data[train_indices, ]
test_data <- insurance_data[-train_indices, ]
# NOTE: from here on, insurance_data refers to the TRAINING rows only;
# test_data is held out until model evaluation.
insurance_data <- train_data # renaming as insurance_data to make code easier to understand
# Quick look at the training data
head(insurance_data)
## age sex bmi children smoker region charges
## 415 19 female 35.150 0 no northwest 2134.901
## 463 62 female 38.095 2 no northeast 15230.324
## 179 46 female 28.900 2 no southwest 8823.279
## 526 18 female 33.880 0 no southeast 11482.635
## 195 18 male 34.430 0 no southeast 1137.470
## 938 39 female 24.225 5 no northwest 8965.796
str(insurance_data)
## 'data.frame': 936 obs. of 7 variables:
## $ age : int 19 62 46 18 18 39 41 62 20 24 ...
## $ sex : Factor w/ 2 levels "female","male": 1 1 1 1 2 1 1 2 2 2 ...
## $ bmi : num 35.1 38.1 28.9 33.9 34.4 ...
## $ children: int 0 2 2 0 0 5 3 0 0 0 ...
## $ smoker : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 2 2 ...
## $ region : Factor w/ 4 levels "northeast","northwest",..: 2 1 4 3 3 2 4 3 4 1 ...
## $ charges : num 2135 15230 8823 11483 1137 ...
summary(insurance_data)
## age sex bmi children smoker
## Min. :18.00 female:448 Min. :15.96 Min. :0.000 no :733
## 1st Qu.:27.00 male :488 1st Qu.:26.48 1st Qu.:0.000 yes:203
## Median :39.00 Median :30.78 Median :1.000
## Mean :39.26 Mean :30.87 Mean :1.121
## 3rd Qu.:51.00 3rd Qu.:34.80 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## region charges
## northeast:228 Min. : 1136
## northwest:227 1st Qu.: 4910
## southeast:250 Median : 9623
## southwest:231 Mean :13517
## 3rd Qu.:17096
## Max. :63770
# Freedman-Diaconis histogram bin width: 2 * IQR / n^(1/3).
#
# Args:
#   data: numeric vector (NAs are ignored).
# Returns:
#   A positive bin width, or NA_real_ for empty (all-NA) input.
freedman_diaconis_bin_width <- function(data) {
  # Drop NAs once so that IQR and n agree on the same observations
  # (the original computed IQR with na.rm = TRUE but counted NAs in n).
  data <- data[!is.na(data)]
  n <- length(data)
  if (n == 0) {
    return(NA_real_)
  }
  width <- 2 * IQR(data) / (n^(1/3))
  # The FD width collapses to 0 when the IQR is 0 (heavily repeated values),
  # and geom_histogram(binwidth = 0) errors out. Fall back to a
  # Sturges-style width over the data range in that case.
  if (width == 0) {
    rng <- diff(range(data))
    width <- if (rng > 0) rng / (ceiling(log2(n)) + 1) else 1
  }
  width
}
# Univariate EDA: bar chart for each factor column; Freedman-Diaconis-binned
# histogram plus boxplot for each numeric column.
for (column_name in names(insurance_data)) {
  column_data <- insurance_data[[column_name]]
  if (is.factor(column_data)) {
    # Categorical variable: one bar per level
    bar_chart <- ggplot(insurance_data, aes(x=column_data)) +
      geom_bar(aes(fill=column_data), color="black") +
      labs(title=paste("Distribution of", column_name), x=column_name) +
      theme_light() +
      scale_fill_brewer(palette="Set2")
    print(bar_chart)
  } else {
    # Continuous variable: histogram (FD bin width) and boxplot
    fd_width <- freedman_diaconis_bin_width(column_data)
    histogram <- ggplot(insurance_data, aes(x=column_data)) +
      geom_histogram(binwidth=fd_width, fill="blue", color="black", alpha=0.7) +
      labs(title=paste("Distribution of", column_name), x=column_name) +
      theme_light()
    spread_plot <- ggplot(insurance_data, aes(y=column_data)) +
      geom_boxplot(fill="blue", color="black", alpha=0.7) +
      labs(title=paste("Boxplot of", column_name), y=column_name) +
      theme_light()
    print(histogram)
    print(spread_plot)
  }
}
# Identify continuous (non-factor) columns
continuous_columns <- names(insurance_data)[sapply(insurance_data, function(x) !is.factor(x))]
# Pairwise scatter plots and correlations between continuous variables.
# Each unordered pair is visited twice (x vs y and y vs x), matching the
# original output layout.
for (col1 in continuous_columns) {
  for (col2 in continuous_columns) {
    if (col1 != col2) {
      # aes_string() is deprecated since ggplot2 3.0.0; use the .data
      # pronoun for programmatic column references instead.
      scatter_plot <- ggplot(insurance_data, aes(x=.data[[col1]], y=.data[[col2]])) +
        geom_point(aes(color=.data[[col1]]), alpha=0.5) +
        labs(title=paste("Scatter plot of", col1, "vs.", col2),
             x=col1, y=col2, color=col1) +
        theme_light() +
        scale_color_viridis_c() # Add a continuous color scale
      print(scatter_plot)
      # Print pairwise Pearson correlation
      correlation <- cor(insurance_data[[col1]], insurance_data[[col2]], use="complete.obs")
      cat(paste("Correlation between", col1, "and", col2, "is:", round(correlation, 2)), "\n")
    }
  }
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Correlation between age and bmi is: 0.09
## Correlation between age and children is: 0.03
## Correlation between age and charges is: 0.26
## Correlation between bmi and age is: 0.09
## Correlation between bmi and children is: 0
## Correlation between bmi and charges is: 0.19
## Correlation between children and age is: 0.03
## Correlation between children and bmi is: 0
## Correlation between children and charges is: 0.08
## Correlation between charges and age is: 0.26
## Correlation between charges and bmi is: 0.19
## Correlation between charges and children is: 0.08
# Identify categorical (factor) columns
categorical_columns <- names(insurance_data)[sapply(insurance_data, is.factor)]
# Boxplot of each continuous variable split by each categorical variable.
for (col1 in continuous_columns) {
  for (col2 in categorical_columns) {
    # BUG FIX: the original used geom_boxplot(aes(fill=col2)), which maps
    # fill to the literal string constant (one color for all boxes) rather
    # than the column's values. Use the .data pronoun, which also replaces
    # the deprecated aes_string().
    box_plot <- ggplot(insurance_data, aes(x=.data[[col2]], y=.data[[col1]])) +
      geom_boxplot(aes(fill=.data[[col2]])) +
      labs(title=paste("Boxplot of", col1, "by", col2),
           x=col2, y=col1, fill=col2) +
      theme_light() +
      scale_fill_viridis(discrete=TRUE)
    print(box_plot)
  }
}
# Print a contingency table for every ordered pair of distinct factor
# columns (each unordered pair therefore appears twice, once per
# orientation), matching the original output layout.
for (row_var in categorical_columns) {
  for (col_var in categorical_columns) {
    if (row_var == col_var) next
    xtab <- table(insurance_data[[row_var]], insurance_data[[col_var]])
    cat("\nContingency table for", row_var, "vs.", col_var, ":\n")
    print(xtab)
  }
}
##
## Contingency table for sex vs. smoker :
##
## no yes
## female 369 79
## male 364 124
##
## Contingency table for sex vs. region :
##
## northeast northwest southeast southwest
## female 105 117 114 112
## male 123 110 136 119
##
## Contingency table for smoker vs. sex :
##
## female male
## no 369 364
## yes 79 124
##
## Contingency table for smoker vs. region :
##
## northeast northwest southeast southwest
## no 176 180 189 188
## yes 52 47 61 43
##
## Contingency table for region vs. sex :
##
## female male
## northeast 105 123
## northwest 117 110
## southeast 114 136
## southwest 112 119
##
## Contingency table for region vs. smoker :
##
## no yes
## northeast 176 52
## northwest 180 47
## southeast 189 61
## southwest 188 43
# Scatter plots for every ordered pair of continuous variables, once per
# categorical column used as the color dimension.
for (col1 in continuous_columns) {
  for (col2 in continuous_columns) {
    if (col1 != col2) {
      for (cat_col in categorical_columns) {
        # aes_string() is deprecated since ggplot2 3.0.0; use the .data
        # pronoun for programmatic column references instead.
        scatter_plot <- ggplot(insurance_data, aes(x=.data[[col1]], y=.data[[col2]])) +
          geom_point(aes(color=.data[[cat_col]]), alpha=0.5) +
          labs(title=paste("Scatter plot of", col1, "vs.", col2, "colored by", cat_col),
               x=col1, y=col2, color=cat_col) +
          theme_light() +
          scale_color_brewer(palette="Set1")
        print(scatter_plot)
      }
    }
  }
}
# Age vs. charges colored by smoking status, with a per-group OLS line.
# Renamed the variable from `plot` to avoid shadowing base::plot().
age_charges_by_smoker <- ggplot(insurance_data, aes(x=age, y=charges, color=smoker)) +
  geom_point(alpha=0.5) +
  geom_smooth(method="lm", se=FALSE) + # Linear regression per group
  labs(title="Relationship between Age and Charges by Smoking Status") +
  theme_light() +
  scale_color_manual(values=c("red", "blue"))
print(age_charges_by_smoker)
## `geom_smooth()` using formula = 'y ~ x'
# Re-derive the factor columns, excluding smoker (already shown above).
categorical_columns <- names(insurance_data)[sapply(insurance_data, is.factor) & names(insurance_data) != "smoker"]
for (cat_col in categorical_columns) {
  # Use the .data pronoun instead of aes(color=insurance_data[[cat_col]]):
  # indexing the data frame directly inside aes() works here but produces a
  # legend titled "insurance_data[[cat_col]]" and silently breaks if the
  # plot's data argument ever changes.
  cat_scatter <- ggplot(insurance_data, aes(x=age, y=charges, color=.data[[cat_col]])) +
    geom_point(alpha=0.5) +
    geom_smooth(method="lm", se=FALSE) +
    labs(title=paste("Relationship between Age and Charges by", cat_col),
         color=cat_col) +
    theme_light()
  print(cat_scatter)
}
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# Fit a regression tree (method="anova") of charges on all predictors;
# its splits motivate the group variables created below
fit <- rpart(charges ~ ., data=insurance_data, method="anova")
# Plot the decision tree
rpart.plot(fit, yesno=2, type=3, box.palette="RdBu", fallen.leaves=TRUE)
# Create group variables based on the decision-tree splits above.
# group: the four leaves of the fitted tree (smoker x age/bmi splits).
insurance_data$group <- with(insurance_data, ifelse(smoker == "no" & age < 43, "non-smoker & age<43",
                                             ifelse(smoker == "no" & age >= 43, "non-smoker & age>=43",
                                             ifelse(smoker == "yes" & bmi < 30, "smoker & bmi<30",
                                                    "smoker & bmi>=30"))))
# group2: alternative grouping using bmi for non-smokers as well.
insurance_data$group2 <- with(insurance_data, ifelse(smoker == "no" & bmi < 30, "non-smoker & bmi<30",
                                              ifelse(smoker == "no" & bmi >= 30, "non-smoker & bmi>=30",
                                              ifelse(smoker == "yes" & bmi < 30, "smoker & bmi<30",
                                                     "smoker & bmi>=30"))))
# group3: refines young non-smokers by whether they have children.
# BUG FIX: the original labelled BOTH the children == 0 and children != 0
# branches "non-smoker & age<43 & 0children", which collapsed the intended
# split (e.g. a 5-child row was labelled "0children"). The children != 0
# branch now gets its own label.
insurance_data$group3 <- with(insurance_data, ifelse(smoker == "no" & age < 43 & children == 0, "non-smoker & age<43 & 0children",
                                              ifelse(smoker == "no" & age < 43 & children != 0, "non-smoker & age<43 & >0children",
                                              ifelse(smoker == "no" & age >= 43, "non-smoker & age>=43",
                                              ifelse(smoker == "yes" & bmi < 30, "smoker & bmi<30",
                                                     "smoker & bmi>=30")))))
insurance_data$group <- as.factor(insurance_data$group)
insurance_data$group2 <- as.factor(insurance_data$group2)
insurance_data$group3 <- as.factor(insurance_data$group3)
# Facet the age-vs-charges scatterplot (colored by smoker) by every factor
# column, including the new tree-derived group variables.
faceting_columns <- names(insurance_data)[sapply(insurance_data, function(x) is.factor(x))]
for (facet_var in faceting_columns) {
  faceted <- ggplot(insurance_data, aes(x=age, y=charges, color=smoker)) +
    geom_point(alpha=0.6) +
    facet_wrap(as.formula(paste("~", facet_var))) +
    labs(title=paste("Age vs. Charges by Smoker Status, Faceted by", facet_var)) +
    theme_light()
  print(faceted)
}
# Same view restricted to the tree-derived `group` variable, with a free
# y scale per panel so each leaf's spread is visible.
grouped_scatter <- ggplot(insurance_data, aes(x=age, y=charges, color=smoker)) +
  geom_point(alpha=0.6) +
  facet_wrap(~ group, scales="free_y") +
  labs(title="Scatterplot of Age vs. Charges by Groups based on Tree Splits") +
  theme_light()
print(grouped_scatter)
# Fit a shallower decision tree (looser cp, max depth 3) for comparison
fit <- rpart(charges ~ ., data=insurance_data, method="anova", control=rpart.control(cp=0.001, maxdepth=3))
# Plot the decision tree
rpart.plot(fit, yesno=2, type=3, box.palette="RdBu", fallen.leaves=TRUE)
# Re-derive factor columns (now including group/group2/group3), excluding smoker
categorical_columns <- names(insurance_data)[sapply(insurance_data, is.factor) & names(insurance_data) != "smoker"]
for (cat_col in categorical_columns) {
  # Use the .data pronoun instead of aes(color=insurance_data[[cat_col]]):
  # indexing the data frame directly inside aes() yields an ugly legend
  # title and breaks if the plot's data argument ever changes.
  cat_scatter <- ggplot(insurance_data, aes(x=age, y=charges, color=.data[[cat_col]])) +
    geom_point(alpha=0.5) +
    geom_smooth(method="lm", se=FALSE) +
    labs(title=paste("Relationship between Age and Charges by", cat_col),
         color=cat_col) +
    theme_light()
  print(cat_scatter)
}
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# Inspect the augmented training data (now carries the tree-derived
# group/group2/group3 factors)
head(insurance_data)
## age sex bmi children smoker region charges group
## 415 19 female 35.150 0 no northwest 2134.901 non-smoker & age<43
## 463 62 female 38.095 2 no northeast 15230.324 non-smoker & age>=43
## 179 46 female 28.900 2 no southwest 8823.279 non-smoker & age>=43
## 526 18 female 33.880 0 no southeast 11482.635 non-smoker & age<43
## 195 18 male 34.430 0 no southeast 1137.470 non-smoker & age<43
## 938 39 female 24.225 5 no northwest 8965.796 non-smoker & age<43
## group2 group3
## 415 non-smoker & bmi>=30 non-smoker & age<43 & 0children
## 463 non-smoker & bmi>=30 non-smoker & age>=43
## 179 non-smoker & bmi<30 non-smoker & age>=43
## 526 non-smoker & bmi>=30 non-smoker & age<43 & 0children
## 195 non-smoker & bmi>=30 non-smoker & age<43 & 0children
## 938 non-smoker & bmi<30 non-smoker & age<43 & 0children
str(insurance_data)
## 'data.frame': 936 obs. of 10 variables:
## $ age : int 19 62 46 18 18 39 41 62 20 24 ...
## $ sex : Factor w/ 2 levels "female","male": 1 1 1 1 2 1 1 2 2 2 ...
## $ bmi : num 35.1 38.1 28.9 33.9 34.4 ...
## $ children: int 0 2 2 0 0 5 3 0 0 0 ...
## $ smoker : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 2 2 ...
## $ region : Factor w/ 4 levels "northeast","northwest",..: 2 1 4 3 3 2 4 3 4 1 ...
## $ charges : num 2135 15230 8823 11483 1137 ...
## $ group : Factor w/ 4 levels "non-smoker & age<43",..: 1 2 2 1 1 1 1 2 3 4 ...
## $ group2 : Factor w/ 4 levels "non-smoker & bmi<30",..: 2 2 1 2 2 1 2 2 3 4 ...
## $ group3 : Factor w/ 4 levels "non-smoker & age<43 & 0children",..: 1 2 2 1 1 1 1 2 3 4 ...
# Drop the tree-derived grouping columns before feature selection so they
# do not leak the tree's structure into the linear models.
drop_variable_for_feature <- c("group", "group2", "group3")
insurance_data <- insurance_data[, setdiff(colnames(insurance_data), drop_variable_for_feature)]
#######################
## Feature selection ##
#######################
## Lasso feature selection
# Convert factor variables to one-hot encoding (~ . - 1 drops the intercept
# column, so both levels of each 2-level factor appear explicitly)
train_data_encoded <- model.matrix(~ . - 1, data = insurance_data)
# Convert data to matrix form, excluding the response column
exclude_variable_name <- "charges" # Replace with the actual variable name
x <- as.matrix(train_data_encoded[, !colnames(train_data_encoded) %in% exclude_variable_name]) # predictors
# NOTE: train_data and insurance_data hold the same rows here (insurance_data
# was reassigned from train_data after the split), so this y lines up with x
y <- train_data$charges # response
# Apply Lasso
lasso_model <- glmnet(x, y, alpha = 1) # alpha=1 indicates Lasso
# Cross-validation for lambda selection
cv.lasso <- cv.glmnet(x, y, alpha = 1)
plot(cv.lasso)
# lambda.min: the lambda minimizing CV error
optimal_lambda <- cv.lasso$lambda.min
# Coefficients at optimal lambda ("." marks coefficients shrunk to zero)
coef(lasso_model, s = optimal_lambda)
## 10 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -11706.2235
## age 232.7559
## sexfemale .
## sexmale .
## bmi 337.8689
## children 572.0103
## smokeryes 23603.4315
## regionnorthwest .
## regionsoutheast -92.2366
## regionsouthwest -331.1749
## Forward selection via regsubsets (adds the best predictor at each step)
reg.fwd=regsubsets(charges~.,data=insurance_data,method="forward")
# Adjusted R-squared per model size (1..8 predictors)
summary(reg.fwd)$adjr2
## [1] 0.6261174 0.7104032 0.7412801 0.7454610 0.7456105 0.7457828 0.7457844
## [8] 0.7456120
# Training residual sum of squares per model size
summary(reg.fwd)$rss
## [1] 52510373307 40629211034 36258409342 35634203991 35575017163 35512689427
## [7] 35474244642 35460055213
# BIC per model size (lower is better)
summary(reg.fwd)$bic
## [1] -908.1678 -1141.4321 -1241.1222 -1250.5346 -1245.2489 -1240.0486 -1234.2208
## [8] -1227.7537
# Plot BIC, adjusted R^2 and train RSS against model size, marking the
# best value of each criterion with a red point
par(mfrow=c(1,3))
bics<-summary(reg.fwd)$bic
plot(1:8,bics,type="l",ylab="BIC",xlab="# of predictors")
index<-which(bics==min(bics))
points(index,bics[index],col="red",pch=10)
adjr2<-summary(reg.fwd)$adjr2
plot(1:8,adjr2,type="l",ylab="Adjusted R-squared",xlab="# of predictors")
index<-which(adjr2==max(adjr2))
points(index,adjr2[index],col="red",pch=10)
rss<-summary(reg.fwd)$rss
plot(1:8,rss,type="l",ylab="train RSS",xlab="# of predictors")
index<-which(rss==min(rss))
points(index,rss[index],col="red",pch=10)
# Coefficients of the 4-predictor model (the BIC-optimal size)
coef(reg.fwd,4)
## (Intercept) age bmi children smokeryes
## -12811.3853 240.8897 354.1445 667.6838 23925.0815
print("")
## [1] ""
# Coefficients of the full 8-predictor model for comparison
coef(reg.fwd,8)
## (Intercept) age sexmale bmi children
## -12473.5434 240.0827 -248.8674 369.9369 673.3514
## smokeryes regionnorthwest regionsoutheast regionsouthwest
## 23938.2867 -595.1616 -948.1999 -1116.6608
print("")
## [1] ""
##################################
## Building Interpretable Model ##
##################################
# Fit several linear model configurations of modest complexity and inspect
# their diagnostic plots. Based on the residuals-vs-fitted and normal Q-Q
# plots, the standard linear-model assumptions are not fully met for any of
# these configurations.
simple_linear_model <- lm(charges ~ age + sex + bmi + children + smoker, data = insurance_data)
par(mfrow=c(2,2))
plot(simple_linear_model)
# Log-transformed response
log_linear_model <- lm(log(charges) ~ age + sex + bmi + children + smoker, data = insurance_data)
par(mfrow=c(2,2))
plot(log_linear_model)
# Log response with log-transformed continuous predictors
log_log_linear_model <- lm(log(charges) ~ log(age) + sex + log(bmi) + children + smoker, data = insurance_data)
par(mfrow=c(2,2))
plot(log_log_linear_model)
# Age-by-smoker interaction (interaction term is not significant below)
interaction_linear_model <- lm(charges ~ age * smoker + sex + bmi + children, data = insurance_data)
par(mfrow=c(2,2))
plot(interaction_linear_model)
summary(interaction_linear_model)
##
## Call:
## lm(formula = charges ~ age * smoker + sex + bmi + children, data = insurance_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11428.5 -3062.9 -906.6 1665.3 29891.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12629.35 1197.94 -10.543 < 2e-16 ***
## age 237.27 16.48 14.397 < 2e-16 ***
## smokeryes 23312.70 1480.77 15.744 < 2e-16 ***
## sexmale -229.43 408.76 -0.561 0.575
## bmi 356.53 33.33 10.698 < 2e-16 ***
## children 668.71 165.57 4.039 5.82e-05 ***
## age:smokeryes 16.58 36.25 0.458 0.647
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6192 on 929 degrees of freedom
## Multiple R-squared: 0.7467, Adjusted R-squared: 0.7451
## F-statistic: 456.4 on 6 and 929 DF, p-value: < 2.2e-16
#######################################
## Simple model for interpratability ##
#######################################
# PAIRED BOOTSTRAP: resample (x, y) pairs to get assumption-light 95%
# percentile confidence intervals for the simple model's coefficients
print("Paired Bootstrap")
## [1] "Paired Bootstrap"
boot.p<-paired.boot(charges ~ age + bmi + children + smoker,
B=3000,
seed=1234,
data = insurance_data)
# 2.5% / 97.5% percentile bounds per coefficient; none of the intervals
# crosses zero, so all four predictors appear stable
t(apply(boot.p$bootEstParam,2,quantile,probs=c(.025,.975)))
## 2.5% 97.5%
## (Intercept) -15184.2133 -10444.0836
## age 213.0127 268.0117
## bmi 285.2497 428.0634
## children 365.1938 973.1746
## smokeryes 22583.3657 25238.2086
# Final simple (main-effects-only) model
simple <- lm(charges ~ age + bmi + children + smoker, data = insurance_data)
summary(simple)
##
## Call:
## lm(formula = charges ~ age + bmi + children + smoker, data = insurance_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11304.4 -3089.2 -897.7 1721.3 29887.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12811.39 1164.39 -11.003 < 2e-16 ***
## age 240.89 14.67 16.423 < 2e-16 ***
## bmi 354.14 33.13 10.689 < 2e-16 ***
## children 667.68 165.34 4.038 5.83e-05 ***
## smokeryes 23925.08 491.07 48.721 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6187 on 931 degrees of freedom
## Multiple R-squared: 0.7465, Adjusted R-squared: 0.7455
## F-statistic: 685.6 on 4 and 931 DF, p-value: < 2.2e-16
###################
## Complex model ##
###################
# Bootstrap the interaction model: separate age and bmi slopes per smoker
# level (no main effects for age/bmi, so each slope is level-specific)
print("Paired Bootstrap with interaction")
## [1] "Paired Bootstrap with interaction"
boot.p<-paired.boot(charges ~ age:smoker + bmi:smoker + children,
B=3000,
seed=1234,
data = insurance_data)
# 95% percentile intervals; the smoker-specific bmi slopes are far apart,
# supporting the interaction structure
t(apply(boot.p$bootEstParam,2,quantile,probs=c(.025,.975)))
## 2.5% 97.5%
## (Intercept) -8948.30448 -5383.5289
## children 381.20252 889.4906
## age:smokerno 251.67237 302.0907
## age:smokeryes 98.19227 203.7076
## smokerno:bmi 72.00424 176.5724
## smokeryes:bmi 1003.82385 1152.1987
# Final complex (interaction) model
complex <- lm(charges ~ age:smoker + bmi:smoker + children, data = insurance_data)
summary(complex)
##
## Call:
## lm(formula = charges ~ age:smoker + bmi:smoker + children, data = insurance_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10610.4 -1985.7 -1032.1 93.2 30199.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7135.39 963.51 -7.406 2.92e-13 ***
## children 629.39 137.57 4.575 5.40e-06 ***
## age:smokerno 277.25 13.49 20.551 < 2e-16 ***
## age:smokeryes 151.31 24.04 6.295 4.74e-10 ***
## smokerno:bmi 121.10 29.07 4.166 3.38e-05 ***
## smokeryes:bmi 1077.72 37.37 28.841 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5147 on 930 degrees of freedom
## Multiple R-squared: 0.8247, Adjusted R-squared: 0.8238
## F-statistic: 875.3 on 5 and 930 DF, p-value: < 2.2e-16
######################
## Evaluation Model ##
######################
# Compute held-out performance metrics for a fitted regression model.
#
# Args:
#   model:     a fitted model supporting predict(), AIC() and BIC() (e.g. lm).
#   test_data: data.frame containing the predictors and the response column.
#   response:  name of the response column; defaults to "charges" so existing
#              two-argument calls keep working.
# Returns:
#   A named list with MSE, R2 and Adj_R2 (computed on test_data) and
#   AIC / BIC (computed from the model's own training-time likelihood,
#   NOT from test_data).
evaluate_model <- function(model, test_data, response = "charges") {
  # 1. Predict on test data
  test_predictions <- predict(model, newdata = test_data)
  # 2. Mean squared prediction error
  actual_values <- test_data[[response]]
  errors <- test_predictions - actual_values
  MSE <- mean(errors^2)
  # 3. R squared and adjusted R squared on the held-out data
  SST <- sum((actual_values - mean(actual_values))^2)
  SSR <- sum(errors^2)
  R2 <- 1 - (SSR / SST)
  n <- length(actual_values)
  # Exclude the intercept and drop NA (aliased) coefficients so p counts
  # only the predictors that were actually estimated.
  p <- sum(!is.na(coefficients(model))) - 1
  Adj_R2 <- 1 - ((1 - R2) * (n - 1) / (n - p - 1))
  # 4. Information criteria from the training fit (k = 2 is AIC's default
  # penalty per parameter)
  AIC_val <- AIC(model)
  BIC_val <- BIC(model)
  # Return the metrics
  list(
    MSE = MSE,
    R2 = R2,
    Adj_R2 = Adj_R2,
    AIC = AIC_val,
    BIC = BIC_val
  )
}
# Evaluate both candidate models on the held-out test set
metrics.simple <- evaluate_model(simple, test_data)
metrics.complex <- evaluate_model(complex, test_data)
print("Simple")
## [1] "Simple"
metrics.simple
## $MSE
## [1] 33979472
##
## $R2
## [1] 0.7529387
##
## $Adj_R2
## [1] 0.7504495
##
## $AIC
## [1] 19006.09
##
## $BIC
## [1] 19035.14
print("Complex")
## [1] "Complex"
metrics.complex
## $MSE
## [1] 24590140
##
## $R2
## [1] 0.8212076
##
## $Adj_R2
## [1] 0.8189501
##
## $AIC
## [1] 18662.81
##
## $BIC
## [1] 18696.7
# The complex (interaction) model beats the simple model on every reported
# metric: lower test MSE, higher R2, and lower AIC/BIC
# Prepare numeric copies of train/test data for KNN: the smoker factor is
# recoded as a 0/1 numeric indicator so it enters the distance computation
train_data2 = insurance_data
validation_data2 = test_data
# train_data2$IsSouth = NA
# train_data2$IsSouth = ifelse(grepl("south", train_data1$region, ignore.case = TRUE), 1, 0)
train_data2$smoker = ifelse(grepl("yes", insurance_data$smoker, ignore.case = TRUE), 1, 0)
# validation_data2$IsSouth = NA
# validation_data2$IsSouth = ifelse(grepl("south", validation_data1$region, ignore.case = TRUE), 1, 0)
validation_data2$smoker = ifelse(grepl("yes", test_data$smoker, ignore.case = TRUE), 1, 0)
# Tune k via 10-fold cross-validation over a small grid
# NOTE(review): predictors are not standardized, so KNN distances are
# dominated by the larger-scale features — consider
# preProcess = c("center", "scale") in train(); confirm intent
tuneGrid <- expand.grid(k = c(1:10, 20, 30))
fitControl <- trainControl(method = "cv", number = 10)
knn_fit <- train(charges ~ age + bmi + children + smoker, data = train_data2, method = "knn",
trControl = fitControl, tuneGrid=tuneGrid)
print(knn_fit)
## k-Nearest Neighbors
##
## 936 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 842, 842, 844, 844, 842, 840, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 12711.35 0.2066316 6796.781
## 2 11696.10 0.1874317 7301.533
## 3 11507.25 0.1817744 7572.681
## 4 11398.63 0.1742334 7725.796
## 5 11466.58 0.1576801 7971.900
## 6 11479.73 0.1492202 8163.695
## 7 11464.86 0.1476428 8239.592
## 8 11526.00 0.1342101 8327.375
## 9 11561.01 0.1247275 8399.734
## 10 11551.57 0.1238090 8481.278
## 20 11633.05 0.1068564 8970.151
## 30 11649.47 0.1021759 9086.045
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 4.
# NOTE(review): the CV output above selected k = 4 by RMSE (the old comment
# claimed k = 6), yet the model below was fit with k = 9 — presumably a
# leftover from an earlier run. k = 9 is kept to preserve the reported
# results; confirm the intended k (knn_fit$bestTune$k would use the CV pick).
predictors <- c("age", "bmi", "children", "smoker")
response <- "charges"
trainx <- train_data2[, predictors]
trainy <- train_data2$charges
testx <- validation_data2[, predictors]
testy <- validation_data2$charges
knn_model <- knnreg(trainx, trainy, k = 9)
# Predicted vs. actual charges on the validation set
plot(testy, predict(knn_model, testx))
test_result <- predict(knn_model, testx)
# BUG FIX: the original computed mean(test_result - testy)^2, i.e. the
# SQUARED MEAN ERROR (which is why it printed ~75e3 while the linear models'
# MSEs were ~25e6). Parenthesize correctly to get the mean squared error.
MSE <- mean((test_result - testy)^2)
print("MSE")
## [1] "MSE"
print(MSE)